import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from textblob import TextBlob # For sentiment analysis
# Load the Netflix titles dataset from a CSV file given by a relative path
# (so it is resolved against the current working directory).
data = pd.read_csv('netflix_titles.csv')
# Notebook display: (number of rows, number of columns).
data.shape
(8807, 12)
# Notebook display: the column labels available in the dataset.
data.columns
Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
'release_year', 'rating', 'duration', 'listed_in', 'description'],
dtype='object')
# Notebook display: preview the first two rows.
data.head(2)
| | show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable. |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth. |
# Number of titles per rating value, as a two-column frame (rating, counts).
d = data.groupby('rating').size().reset_index(name='counts')

# Pie chart of how the catalogue is distributed across ratings.
piechart = px.pie(
    d,
    names='rating',
    values='counts',
    title='Distribution of rating',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
piechart.show()
# Titles without a director get an explicit placeholder so they can be
# filtered out by name later instead of being silently dropped.
data['director'] = data['director'].fillna('No Director Available')

# One entry per (title, director): split the comma-separated list, stack the
# resulting columns into a long Series, then strip surrounding whitespace.
# Without the strip, names that follow a comma keep a leading space
# (" José Luis Ucha") and are counted separately from "José Luis Ucha".
filtered_director = (
    data['director']
    .str.split(',', expand=True)
    .stack()
    .str.strip()
)
filtered_director.head(9)
0 0 Kirsten Johnson 1 0 No Director Available 2 0 Julien Leclercq 3 0 No Director Available 4 0 No Director Available 5 0 Mike Flanagan 6 0 Robert Cullen 1 José Luis Ucha 7 0 Haile Gerima dtype: object
# Convert the stacked Series into a single-column frame named 'Director'.
filtered_director = filtered_director.to_frame(name='Director')

# Count how many entries each director name has.
directors = (
    filtered_director
    .groupby('Director')
    .size()
    .reset_index(name='Total Content')
)
directors.head()
| | Director | Total Content |
|---|---|---|
| 0 | Aaron Moorhead | 2 |
| 1 | Aaron Woolf | 1 |
| 2 | Abbas Alibhai Burmawalla | 1 |
| 3 | Abdullah Al Noor | 1 |
| 4 | Abhinav Shiv Tiwari | 1 |
# Remove the placeholder row that stands for titles with no director.
directors = directors.loc[directors['Director'] != 'No Director Available']

# Order by number of titles, largest first, and keep the first five rows.
directors = directors.sort_values(by='Total Content', ascending=False)
directors_top_5 = directors.head(5)
directors_top_5
| | Director | Total Content |
|---|---|---|
| 4021 | Rajiv Chilaka | 22 |
| 4068 | Raúl Campos | 18 |
| 261 | Jan Suter | 18 |
| 4652 | Suhas Kadav | 16 |
| 3235 | Marcus Raboy | 16 |
# Re-sort the five rows in ascending order before plotting (presumably so the
# longest bar ends up at the top of the horizontal chart).
directors_top_5 = directors_top_5.sort_values(by='Total Content', ascending=True)
directors_top_5
| | Director | Total Content |
|---|---|---|
| 4652 | Suhas Kadav | 16 |
| 3235 | Marcus Raboy | 16 |
| 4068 | Raúl Campos | 18 |
| 261 | Jan Suter | 18 |
| 4021 | Rajiv Chilaka | 22 |
# Bar chart of the five directors: title count on x, director name on y.
fig_1 = px.bar(
    directors_top_5,
    x='Total Content',
    y='Director',
    title='Top 5 Directors on Netflix',
)
fig_1.show()
# Titles without a cast list get an explicit placeholder so they can be
# filtered out by name later.
data['cast'] = data['cast'].fillna('No Cast Present')

# One entry per (title, actor): split the comma-separated list, stack the
# resulting columns into a long Series, then strip surrounding whitespace.
# Without the strip, every actor after the first keeps a leading space
# (" Khosi Ngema") and is counted separately from the same name listed first.
filtered_cast = (
    data['cast']
    .str.split(',', expand=True)
    .stack()
    .str.strip()
)
filtered_cast.head()
0 0 No Cast Present 1 0 Ama Qamata 1 Khosi Ngema 2 Gail Mabalane 3 Thabang Molaba dtype: object
# Convert the stacked Series into a single-column frame named 'Actor'.
filtered_cast = filtered_cast.to_frame(name='Actor')

# Count how many entries each actor name has.
actors = (
    filtered_cast
    .groupby(by='Actor')
    .size()
    .reset_index(name='Total Content')
)
actors.head()
| | Actor | Total Content |
|---|---|---|
| 0 | Jr. | 2 |
| 1 | "Riley" Lakdhar Dridi | 1 |
| 2 | 'Najite Dede | 1 |
| 3 | 2 Chainz | 1 |
| 4 | 2Mex | 1 |
# Remove the placeholder row that stands for titles with no cast.
actors = actors.loc[actors['Actor'] != 'No Cast Present']

# Order by number of titles, largest first.
actors = actors.sort_values(by='Total Content', ascending=False)
actors.head()
| | Actor | Total Content |
|---|---|---|
| 2612 | Anupam Kher | 39 |
| 26941 | Rupa Bhimani | 31 |
| 30303 | Takahiro Sakurai | 30 |
| 15541 | Julie Tejwani | 28 |
| 23624 | Om Puri | 27 |
# Keep the first five rows of the sorted actor table, then re-sort them in
# ascending order before plotting.
actors_top_5 = actors.head(5).sort_values(by='Total Content', ascending=True)
actors_top_5
| | Actor | Total Content |
|---|---|---|
| 23624 | Om Puri | 27 |
| 15541 | Julie Tejwani | 28 |
| 30303 | Takahiro Sakurai | 30 |
| 26941 | Rupa Bhimani | 31 |
| 2612 | Anupam Kher | 39 |
# Bar chart of the five actors: title count on x, actor name on y.
fig_2 = px.bar(
    actors_top_5,
    x='Total Content',
    y='Actor',
    title='Top 5 Actors on Netflix',
)
# Render explicitly, as the other figures in this file do; a bare `fig_2`
# expression only displays as the last line of a notebook cell.
fig_2.show()
# Keep only the two columns needed for the yearly trend.
df1 = data.loc[:, ['type', 'release_year']]
# Notebook display: number of missing values per column.
df1.isna().sum()
type 0 release_year 0 dtype: int64
# Number of titles for every (release year, type) combination.
df2 = (
    df1
    .groupby(['release_year', 'type'])
    .size()
    .reset_index(name='Total Content')
)
df2.head()
| | release_year | type | Total Content |
|---|---|---|---|
| 0 | 1925 | TV Show | 1 |
| 1 | 1942 | Movie | 2 |
| 2 | 1943 | Movie | 3 |
| 3 | 1944 | Movie | 3 |
| 4 | 1945 | Movie | 3 |
# Restrict the trend to release years strictly after 2010 (2011 onward).
df2 = df2[df2['release_year'] > 2010]

# One line per content type showing the number of titles per release year.
fig_3 = px.line(
    df2,
    x='release_year',
    y='Total Content',
    color='type',
    title='Trend of Content Produced in the Years',
)
# Render explicitly, as the other figures in this file do; a bare `fig_3`
# expression only displays as the last line of a notebook cell.
fig_3.show()
# Build a frame of (Release Year, description) and label each description's
# sentiment. The explicit .copy() makes df_1 independent of `data`, so adding
# the 'Sentiment' column below never writes into a slice of the original frame.
df_1 = data[['release_year', 'description']].copy()
df_1 = df_1.rename(columns={'release_year': 'Release Year'})


def _sentiment_label(text):
    """Return 'Neutral', 'Positive' or 'Negative' for *text*.

    The label is derived from the sign of TextBlob's polarity score:
    exactly 0 is 'Neutral', above 0 is 'Positive', below 0 is 'Negative'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity == 0:
        return 'Neutral'
    if polarity > 0:
        return 'Positive'
    return 'Negative'


# Each row receives only its own label. Assigning through
# .loc[[index, 2], 'Sentiment'] would additionally overwrite the row labelled 2
# on every iteration, leaving it with the label of the last row processed.
df_1['Sentiment'] = df_1['description'].apply(_sentiment_label)
# Number of titles for every (release year, sentiment label) combination.
df_1 = (
    df_1
    .groupby(['Release Year', 'Sentiment'])
    .size()
    .reset_index(name='Total Content')
)
# Keep release years from 2010 onward (2010 itself included).
df_1 = df_1[df_1['Release Year'] >= 2010]

# Bar chart per release year, coloured by sentiment label.
fig_4 = px.bar(
    df_1,
    x='Release Year',
    y='Total Content',
    color='Sentiment',
    title='Sentiment Content on Netflix',
)
# Render explicitly, as the other figures in this file do; a bare `fig_4`
# expression only displays as the last line of a notebook cell.
fig_4.show()
# Export the notebook to HTML via nbconvert. The leading `!` is an
# IPython/Jupyter shell escape, so this line is not valid in a plain Python script.
!jupyter nbconvert --to html DA_portfolio_project_11.ipynb
[NbConvertApp] Converting notebook DA_portfolio_project_11.ipynb to html [NbConvertApp] Writing 4384392 bytes to DA_portfolio_project_11.html